## This file merges together all datasets to create one final dataset for analysis ##
## Note that you must first obtain access to Dave Leip's datasets, ##
## by downloading proprietary files and running code in cleaning_leip_data.R. ##
## Otherwise, this code will produce a dataset that contains all variables except county-year turnout ##

## Created by Meredith Dost and last run 8/18/2025 ##

# load in packages
library(tidyverse)
library(tidyr)
library(readxl)
library(scales)

# set working directory
#setwd("")

### merge ###
data <- read.csv("voting_data/demvoteshare_by_county.csv")

### subset to exclude ALASKA, since their districts/counties are unusual and don't align perfectly across datasets ###
data <- data[!(data$fips>=2701 & data$fips<=2740),]

## sahie data
sahie <- read.csv("demographic_data/sahie_county_data.csv")
data <- merge(data, sahie, by = c("fips","year"))
rm(sahie)

## acs 2010-2020 vars by county ##
acs <- read.csv("demographic_data/acs_county_data.csv")
data <- merge(data, acs, by = c("fips","year"))
rm(acs)

## getting state to county fips xwalk
xwalk <- read.csv("other_data/state_ctyfips_xwalk.csv")
data <- merge(data, xwalk, by = "fips")
rm(xwalk)

## gubernatorial and senate races in midterm years ##
races <- read.csv("other_data/states_gub_sen_elections_10_14_18.csv")
st_ab <- races[,c(1,2)]
races$fips_state <- NULL
races$State <- NULL
races10 <- races[,c(1,2,3)]
races10$year <- 2010
names(races10)[c(2,3)] <- c("gub","sen")
races14 <- races[,c(1,4,5)]
races14$year <- 2014
names(races14)[c(2,3)] <- c("gub","sen")
races18 <- races[,c(1,6,7)]
races18$year <- 2018
names(races18)[c(2,3)] <- c("gub","sen")
races_comb <- rbind.data.frame(races10,races14,races18)
races_comb <- races_comb[which(races_comb$state!="AK"),]
data <- merge(data, races_comb, by = c("state","year"), all=T)
data$gub[is.na(data$gub)] <- 0
data$sen[is.na(data$sen)] <- 0
rm(races_comb,races10,races14,races18)

## state name/abbreviations
data <- merge(data, st_ab, by = "state")
rm(races)

## creating midterm dummy ##
mid <- data.frame(matrix(c(unique(data$year),
                           c(1,0,1,0,1,0)), 
                         nrow=length(unique(data$year)), ncol=2))
colnames(mid) <- c("year","midterm")
data <- merge(data, mid, by = c("year"))
rm(mid)

## bringing in Medicaid expansion status/implementation dates ## LONG version
exp <- read.csv("other_data/Medicaid_expansion_status.csv")
#2014
exp14 <- exp
exp14$exp <- ifelse(exp14$year_implemented==2014, 1, 0)
exp14$exp[is.na(exp14$year_implemented)] <- 0
exp14$year <- 2014
exp14 <- exp14[c("State","year","exp")]
#2016
exp16 <- exp
exp16$exp <- ifelse(exp16$year_implemented>=2014 & exp16$year_implemented<=2016, 1, 0)
exp16$exp[is.na(exp16$year_implemented)] <- 0
exp16$year <- 2016
exp16 <- exp16[c("State","year","exp")]
#2018
exp18 <- exp
exp18$exp <- ifelse(exp18$year_implemented>=2014 & exp18$year_implemented<=2018, 1, 0)
exp18$exp[is.na(exp18$year_implemented)] <- 0
exp18$year <- 2018
exp18 <- exp18[c("State","year","exp")]
#2020
exp20 <- exp
exp20$exp <- ifelse(exp20$year_implemented>=2014 & exp20$year_implemented<=2020, 1, 0)
exp20$exp[is.na(exp20$year_implemented)] <- 0
exp20$year <- 2020
exp20 <- exp20[c("State","year","exp")]
#2012
exp12 <- exp
exp12$exp <- 0
exp12$year <- 2012
exp12 <- exp12[c("State","year","exp")]
#2010
exp10 <- exp
exp10$exp <- 0
exp10$year <- 2010
exp10 <- exp10[c("State","year","exp")]
#merging together
rm(exp)
exp <- rbind.data.frame(exp10,exp12,exp14,exp16,exp18,exp20)
data <- merge(data, exp, by = c("State","year"))
rm(exp,exp10,exp12,exp14,exp16,exp18,exp20)

# creating swing state 2012/2016 dataset
swing <- data.frame(st_ab$state)
names(swing) <- "state"
# swing state status 2016
swing16 <- swing
swing16$swing <- 0
swing16$swing[swing16$state=="FL"|swing16$state=="IA"|swing16$state=="WI"|swing16$state=="MI"|swing16$state=="OH"|swing16$state=="PA"|swing16$state=="ME"|
                swing16$state=="WI"|swing16$state=="NH"|swing16$state=="AZ"|swing16$state=="VA"|swing16$state=="NV"|swing16$state=="CO"|swing16$state=="NC"] <- 1
swing16$year <- 2018
swing16_2 <- swing16
swing16_2$year <- 2020
# swing state status 2012
swing12 <- swing
swing12$swing <- 0
swing12$swing[swing12$state=="FL"|swing12$state=="IA"|swing12$state=="WI"|swing12$state=="OH"|swing12$state=="CO"|swing12$state=="VA"|swing12$state=="NH"|
                swing12$state=="NV"|swing12$state=="NC"] <- 1
swing12$year <- 2014
swing12_2 <- swing12
swing12_2$year <- 2016
# swing state status 2008
swing08 <- swing
swing08$swing <- 0
swing08$swing[swing08$state=="FL"|swing08$state=="IN"|swing08$state=="MO"|swing08$state=="OH"|swing08$state=="CO"|swing08$state=="VA"|swing08$state=="NH"|
                swing08$state=="NV"|swing08$state=="NC"|swing08$state=="PA"] <- 1
swing08$year <- 2010
swing08_2 <- swing08
swing08_2$year <- 2012
# merge swing together
sw <- rbind.data.frame(swing08,swing08_2,swing12,swing12_2,swing16,swing16_2)
data <- merge(data, sw, by = c("state","year"))
rm(st_ab,swing08,swing08_2,swing12,swing12_2,swing16,swing16_2,swing,sw)

## read in Medicaid burden measures
kmerg <- read.csv("burden_data/medburden_measures.csv")
data <- merge(data, kmerg, by = c("State","year"))
rm(kmerg)

## voting burden ##
vburd <- read.csv("burden_data/electburden_measures.csv")
data <- merge(data, vburd, by = c("state","year"))
rm(vburd)

# merging in distance from border
dist <- read.csv("other_data/border_distance.csv")
dataw_border <- merge(data,dist, by = "fips", all.x=T)

#cleaning up R environment
rm(list=ls()[!ls() %in% c("dataw_border")])


###### final cleaning ######
dataw_border$MINDIST[dataw_border$state=="HI"] <- 999
## creating population weight for robustness checks
dataw_border$lpop <- log(dataw_border$pop_total)

###### creating DV ######
## read in leip dataset, created by downloading proprietary files and running code in cleaning_leip_data.R
# leip <- read.csv("voting_data/leip_turnout_county.csv")
# data <- merge(dataw_border, leip, by = c("fips","year"))
# ## turnout ##
# data$turn <- data$turnout/data$cvap*100
# data$turn[data$turn>100] <- 100
# dataw_border <- data

## get rid of observations w/NAs, except for FinalCOVI, which is by definition NA for all of 2010
dataw_border <- dataw_border[complete.cases(dataw_border[,c(1:26,28:ncol(dataw_border))]),]

## save out dataset
write.table(dataw_border, "final_dataset_for_analysis_replication.csv", row.names=F, sep=",")
